In [1]:
import warnings
warnings.filterwarnings("ignore")
Configuration¶
In [2]:
import scanpy as sc
import scvi
import anndata as ad
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Make sure plots appear inline in notebooks
%matplotlib inline
# Set Scanpy plotting settings
sc.settings.set_figure_params(dpi=100, facecolor='white', frameon=False)
# Set scvi-tools settings
scvi.settings.seed = 0
print("scvi-tools version:", scvi.__version__)
print("PyTorch version:", torch.__version__)
# --- Configuration ---
N_HVG = 3000 # Number of highly variable genes to select
N_NEIGHBORS = 15 # Number of neighbors for KNN graph
N_PCS = 30 # Number of PCs for KNN graph (used before scVI)
LEIDEN_RESOLUTION = 0.6 # Resolution for Leiden clustering
SCVI_MAX_EPOCHS = 400 # Maximum training epochs for scVI (can be lowered for speed)
Seed set to 0
scvi-tools version: 1.3.0 PyTorch version: 2.6.0+cu124
In [3]:
sampleid = "Autopsy1"
results_dir = "./102_Scvi_visium_results_"+sampleid+"/"
os.makedirs(results_dir, exist_ok=True)
adata = sc.read_h5ad("./101_Preprocess_processed_adata/QC_processed_"+sampleid+".h5ad")
print("Original AnnData object:")
print(adata)
Original AnnData object:
AnnData object with n_obs × n_vars = 626 × 18074
obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'pass_qc'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
uns: 'spatial'
obsm: 'spatial'
Step 1: Preserving raw counts...¶
In [4]:
if 'counts' not in adata.layers:
adata.layers['counts'] = adata.X.copy()
else:
print("Layer 'counts' already exists. Assuming it contains raw counts.")
Step 2: Basic Quality Control and Filtering¶
In [5]:
# 2a. Filter out spots not 'in_tissue' (usually done by default by spaceranger/read_visium)
if 'in_tissue' in adata.obs.columns:
n_spots_before = adata.n_obs
adata = adata[adata.obs['in_tissue'] == 1, :]
print(f" Filtered out {n_spots_before - adata.n_obs} spots not in tissue.")
else:
print(" 'in_tissue' column not found, assuming all spots are in tissue.")
Filtered out 0 spots not in tissue.
In [6]:
adata.layers['counts'] = adata.X.copy() # Ensure layer 'counts' has filtered raw counts
adata.raw = adata.copy() # Store the filtered data with raw counts in .X into .raw
Step 3: QC¶
In [7]:
sc.pl.violin(adata, ["n_genes_by_counts", "total_counts", "pct_counts_mt"], jitter=0.4, multi_panel=True)
Step 4: Prepare Data for scVI¶
In [8]:
# 4a. Normalize total counts per spot and log-transform (for HVG selection and visualization)
adata_for_hvg = adata.copy() # Work on a copy for HVG selection
sc.pp.normalize_total(adata_for_hvg, target_sum=1e4)
sc.pp.log1p(adata_for_hvg)
In [9]:
# 4b. Identify Highly Variable Genes (HVGs) using the log-normalized data
print(f" Selecting top {N_HVG} Highly Variable Genes...")
sc.pp.highly_variable_genes(
adata_for_hvg,
n_top_genes=N_HVG,
subset=False, # Don't subset yet, just mark them
flavor="seurat_v3", # Common flavor, works well
layer=None # Use adata_for_hvg.X which is log-normalized
)
Selecting top 3000 Highly Variable Genes...
In [10]:
# Transfer the HVG information back to the original adata object
adata.var['highly_variable'] = adata_for_hvg.var['highly_variable']
adata.var['highly_variable_rank'] = adata_for_hvg.var['highly_variable_rank']
adata.var['means'] = adata_for_hvg.var['means']
adata.var['variances'] = adata_for_hvg.var['variances']
adata.var['variances_norm'] = adata_for_hvg.var['variances_norm']
print(f" Marked {adata.var['highly_variable'].sum()} genes as highly variable.")
Marked 3000 genes as highly variable.
Step 5: Setup and Train scVI Model¶
In [11]:
# 5a. Setup AnnData object for scvi-tools
# Tell scVI to use the raw counts stored in the 'counts' layer
# It will automatically use the 'highly_variable' column in adata.var if setup correctly
scvi.model.SCVI.setup_anndata(
adata,
layer="counts", # Use the raw counts layer we created
# Optional: Add batch key if needed, e.g., batch_key="sample_id"
# Optional: Add categorical covariates if relevant, e.g., categorical_covariate_keys=["cell_type"]
)
In [12]:
# 5b. Initialize the scVI model
# n_latent: Dimensionality of the latent space (adjust if needed, 10-30 often works well)
# n_layers: Number of hidden layers in the neural network (1 or 2 is common)
vae = scvi.model.SCVI(adata, n_layers=2, n_latent=30, gene_likelihood="zinb") # ZINB recommended for UMI counts
In [13]:
# 5c. Train the scVI model
# use_gpu=USE_GPU will automatically use GPU if available and specified
# plan_kwargs can control learning rate, weight decay, etc.
# check_val_every_n_epoch=10 helps with early stopping monitoring
print(f" Training scVI model for up to {SCVI_MAX_EPOCHS} epochs...")
vae.train(max_epochs=SCVI_MAX_EPOCHS, plan_kwargs={'lr': 1e-3}, check_val_every_n_epoch=10)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Training scVI model for up to 400 epochs...
`Trainer.fit` stopped: `max_epochs=400` reached.
In [14]:
# Extract ELBO loss from training history
history = vae.history
elbo_train = history["elbo_train"] # Training loss
elbo_validation = history["reconstruction_loss_train"] # Validation loss
# Plot convergence curve
plt.figure(figsize=(6, 4))
plt.plot(elbo_train, label="Training Loss", color="blue")
plt.plot(elbo_validation, label="Validation Loss", color="red", linestyle="dashed")
plt.xlabel("Epochs")
plt.ylabel("Negative ELBO")
plt.title("scVI Training Convergence")
plt.legend()
plt.savefig(os.path.join(results_dir, "scvi_training_elbo.png"))
Step 6: Post-Training Analysis - Latent Space, Clustering, Visualization¶
In [15]:
# 6a. Get the latent representation from the trained model
print(" Extracting scVI latent representation...")
adata.obsm["X_scVI"] = vae.get_latent_representation()
Extracting scVI latent representation...
In [16]:
# 6b. Perform clustering on the scVI latent space
print(" Performing Leiden clustering on scVI latent space...")
sc.pp.neighbors(adata, n_neighbors=N_NEIGHBORS, use_rep="X_scVI")
sc.tl.leiden(adata, resolution=LEIDEN_RESOLUTION, key_added="leiden_scvi", flavor="igraph", n_iterations=2)
Performing Leiden clustering on scVI latent space...
In [17]:
# 6c. Compute UMAP embedding based on the scVI latent space
print(" Computing UMAP embedding...")
sc.tl.umap(adata, min_dist=0.3) # min_dist controls spread
Computing UMAP embedding...
In [18]:
# 6d. Visualize the results
print(" Generating visualizations...")
# UMAP colored by cluster
sc.pl.umap(adata, color="leiden_scvi", title="UMAP colored by scVI Leiden Clusters", show=True)
Generating visualizations...
In [19]:
# Spatial plot colored by cluster
sc.pl.spatial(adata, color="leiden_scvi", title="Spatial - scVI Leiden Clusters", show=True, spot_size=140) # Adjust spot_size as needed
In [20]:
# Visualize QC metric spatially (optional)
sc.pl.spatial(adata, color="total_counts", title="Spatial - Total Counts", show=True, spot_size=140, cmap="jet")
In [21]:
sc.pl.spatial(adata, color="n_genes_by_counts", title="Spatial - Genes per Spot", show=True, spot_size=140)
Step 7: Differential Gene Expression (DGE) using scVI¶
In [22]:
de_df = vae.differential_expression(groupby="leiden_scvi")
In [23]:
de_df
Out[23]:
| proba_de | proba_not_de | bayes_factor | scale1 | scale2 | pseudocounts | delta | lfc_mean | lfc_median | lfc_std | ... | raw_mean1 | raw_mean2 | non_zeros_proportion1 | non_zeros_proportion2 | raw_normalized_mean1 | raw_normalized_mean2 | is_de_fdr_0.05 | comparison | group1 | group2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| C7 | 0.8830 | 0.1170 | 2.021151 | 1.791373e-03 | 5.300365e-04 | 0.000844 | 0.25 | 2.088022 | 2.028671 | 1.375517 | ... | 5.753423 | 1.445835 | 0.965753 | 0.568750 | 21.983667 | 5.805844 | False | 0 vs Rest | 0 | Rest |
| C3 | 0.8760 | 0.1240 | 1.955084 | 2.670462e-03 | 7.671725e-04 | 0.000844 | 0.25 | 2.421173 | 2.201752 | 1.843161 | ... | 8.924650 | 2.091667 | 0.979452 | 0.600000 | 32.602879 | 8.538132 | False | 0 vs Rest | 0 | Rest |
| FBLN1 | 0.8758 | 0.1242 | 1.953245 | 2.285606e-03 | 5.602018e-04 | 0.000844 | 0.25 | 2.595059 | 2.547770 | 1.816693 | ... | 6.986298 | 1.462502 | 0.972603 | 0.527083 | 25.724874 | 5.992639 | False | 0 vs Rest | 0 | Rest |
| PLA2G2A | 0.8726 | 0.1274 | 1.924145 | 4.756092e-03 | 1.502925e-03 | 0.000844 | 0.25 | 2.000116 | 1.998978 | 1.452615 | ... | 16.020544 | 4.106248 | 0.972603 | 0.766667 | 62.787144 | 17.764256 | False | 0 vs Rest | 0 | Rest |
| DCN | 0.8724 | 0.1276 | 1.922348 | 5.135771e-03 | 1.805746e-03 | 0.000844 | 0.25 | 1.899670 | 1.835069 | 1.392366 | ... | 17.575333 | 5.349997 | 0.986301 | 0.818750 | 64.021118 | 20.008612 | False | 0 vs Rest | 0 | Rest |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| CLCNKB | 0.0000 | 1.0000 | -0.000000 | 9.448550e-08 | 2.631730e-07 | 0.000979 | 0.25 | -0.186752 | -0.120852 | 0.325800 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | False | 4 vs Rest | 4 | Rest |
| SPDYA | 0.0000 | 1.0000 | -0.000000 | 4.606433e-08 | 1.267487e-07 | 0.000979 | 0.25 | -0.099041 | -0.053533 | 0.203120 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | False | 4 vs Rest | 4 | Rest |
| FKBP1C | 0.0000 | 1.0000 | -0.000000 | 9.038656e-08 | 2.369628e-07 | 0.000979 | 0.25 | -0.162961 | -0.099140 | 0.328953 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | False | 4 vs Rest | 4 | Rest |
| SHOX | 0.0000 | 1.0000 | -0.000000 | 4.307828e-08 | 2.357177e-07 | 0.000979 | 0.25 | -0.221644 | -0.137150 | 0.289680 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | False | 4 vs Rest | 4 | Rest |
| CNTN6 | 0.0000 | 1.0000 | -0.000000 | 4.649962e-08 | 1.697153e-07 | 0.000979 | 0.25 | -0.143527 | -0.069640 | 0.245403 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | False | 4 vs Rest | 4 | Rest |
90370 rows × 22 columns
In [24]:
de_df = de_df[de_df.proba_de > 0.85]
de_df = de_df[de_df.lfc_mean > 1.0]
In [25]:
de_df = de_df.sort_values(by=["group1", "proba_de", "lfc_mean"], ascending=[True, False, False])
dge_filename = os.path.join(results_dir, "scvi_differential_expression.csv")
de_df.to_csv(dge_filename)
print(f" Differential expression results saved to '{dge_filename}'")
Differential expression results saved to './102_Scvi_visium_results_Autopsy1/scvi_differential_expression.csv'
In [26]:
print("\n Top markers per cluster (based on scVI DGE):")
n_top_markers = 5
for cluster_id in sorted(de_df['group1'].unique()):
print(f"\n --- Cluster {cluster_id} vs Rest ---")
top_markers = de_df[de_df['group1'] == cluster_id].head(n_top_markers)
if top_markers.empty:
print(" No significant markers found with current thresholds.")
else:
print(top_markers[['proba_de', 'lfc_mean', 'non_zeros_proportion1']]) # Show key stats
Top markers per cluster (based on scVI DGE):
--- Cluster 0 vs Rest ---
proba_de lfc_mean non_zeros_proportion1
C7 0.8830 2.088022 0.965753
C3 0.8760 2.421173 0.979452
FBLN1 0.8758 2.595059 0.972603
PLA2G2A 0.8726 2.000116 0.972603
DCN 0.8724 1.899670 0.986301
--- Cluster 1 vs Rest ---
proba_de lfc_mean non_zeros_proportion1
VCAN 0.9618 2.362838 0.991228
BGN 0.9424 2.390476 0.973684
FN1 0.9376 1.743816 0.991228
DEPP1 0.9146 2.012315 0.938596
DKK3 0.8806 3.118816 0.815789
--- Cluster 2 vs Rest ---
proba_de lfc_mean non_zeros_proportion1
FABP4 0.9732 4.615284 0.980519
SCD 0.9676 4.283886 0.941558
PLIN4 0.9380 4.268935 0.850649
G0S2 0.9264 3.944682 0.792208
PLIN1 0.9260 3.846037 0.824675
--- Cluster 3 vs Rest ---
proba_de lfc_mean non_zeros_proportion1
ACTG2 0.9552 3.246862 0.969325
GADD45G 0.9508 3.809372 0.981595
RHOB 0.9392 1.667988 1.000000
CNN1 0.9326 3.781909 0.975460
TPM2 0.9320 2.988886 1.000000
--- Cluster 4 vs Rest ---
proba_de lfc_mean non_zeros_proportion1
APOD 0.8796 2.570791 1.000000
ITGB8 0.8740 4.602444 0.877551
Step 8: Visualizing top marker genes spatially (optional)...¶
In [27]:
# Get some top markers across a few clusters
markers_to_plot = []
for cluster_id in sorted(de_df['group1'].unique())[:3]: # Plot for first 3 clusters
cluster_markers = de_df[de_df['group1'] == cluster_id].head(2).index.tolist()
if cluster_markers:
markers_to_plot.extend(cluster_markers)
markers_to_plot = list(dict.fromkeys(markers_to_plot)) # Unique markers
In [28]:
markers_to_plot
Out[28]:
['C7', 'C3', 'VCAN', 'BGN', 'FABP4', 'SCD']
In [29]:
sc.pl.umap(adata, color = markers_to_plot, cmap='jet')
In [30]:
if markers_to_plot:
print(f" Plotting spatial expression for: {markers_to_plot}")
# Use expression from adata.raw for visualization if desired (raw counts)
# Or use normalized data from adata.X
plt.figure()
sc.pl.spatial(
adata,
color=markers_to_plot,
spot_size=180,
cmap = 'jet',
alpha=1,
ncols=min(len(markers_to_plot), 4), # Adjust layout
# layer='counts', # Uncomment to plot raw counts from the layer
use_raw=True,
show=True,
)
plt.close()
print(f" Marker gene spatial plots saved with prefix in '{results_dir}/figures/'")
else:
print(" No top markers found to plot.")
Plotting spatial expression for: ['C7', 'C3', 'VCAN', 'BGN', 'FABP4', 'SCD']
<Figure size 400x400 with 0 Axes>
Marker gene spatial plots saved with prefix in './102_Scvi_visium_results_Autopsy1//figures/'
Step 9: Saving final AnnData object and scVI model...¶
In [31]:
adata_filename = os.path.join(results_dir, "processed_visium_adata_scvi.h5ad")
adata.write(adata_filename)
print(f" Processed AnnData object saved to '{adata_filename}'")
model_filename = os.path.join(results_dir, "scvi_model")
vae.save(model_filename, overwrite=True)
print(f" Trained scVI model saved to '{model_filename}'")
Processed AnnData object saved to './102_Scvi_visium_results_Autopsy1/processed_visium_adata_scvi.h5ad' Trained scVI model saved to './102_Scvi_visium_results_Autopsy1/scvi_model'